In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from statsmodels.tsa.seasonal import seasonal_decompose

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

Loading the datasets¶

In [2]:
# Loading the datasets and making the date the index of the dataframe
df = pd.read_csv("raw_toronto_weather_data.csv", index_col="date")
df = df.sort_values(by='date')
df.info()
df.head()
<class 'pandas.core.frame.DataFrame'>
Index: 10021 entries, 1996-12-30 to 2024-06-06
Data columns (total 71 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   max_temperature                10021 non-null  float64
 1   avg_hourly_temperature         10021 non-null  float64
 2   avg_temperature                10021 non-null  float64
 3   min_temperature                10021 non-null  float64
 4   max_humidex                    2823 non-null   float64
 5   min_windchill                  3308 non-null   float64
 6   max_relative_humidity          10021 non-null  int64  
 7   avg_hourly_relative_humidity   10021 non-null  float64
 8   avg_relative_humidity          10021 non-null  float64
 9   min_relative_humidity          10021 non-null  int64  
 10  max_dew_point                  10021 non-null  float64
 11  avg_hourly_dew_point           10021 non-null  float64
 12  avg_dew_point                  10021 non-null  float64
 13  min_dew_point                  10021 non-null  float64
 14  max_wind_speed                 10021 non-null  int64  
 15  avg_hourly_wind_speed          10021 non-null  float64
 16  avg_wind_speed                 10021 non-null  float64
 17  min_wind_speed                 10021 non-null  int64  
 18  max_wind_gust                  7084 non-null   float64
 19  wind_gust_dir_10s              7084 non-null   float64
 20  max_pressure_sea               10021 non-null  float64
 21  avg_hourly_pressure_sea        10021 non-null  float64
 22  avg_pressure_sea               10021 non-null  float64
 23  min_pressure_sea               10021 non-null  float64
 24  max_pressure_station           10021 non-null  float64
 25  avg_hourly_pressure_station    10021 non-null  float64
 26  avg_pressure_station           10021 non-null  float64
 27  min_pressure_station           10021 non-null  float64
 28  max_visibility                 10021 non-null  int64  
 29  avg_hourly_visibility          10021 non-null  float64
 30  avg_visibility                 10021 non-null  int64  
 31  min_visibility                 10021 non-null  int64  
 32  max_health_index               240 non-null    float64
 33  avg_hourly_health_index        240 non-null    float64
 34  avg_health_index               240 non-null    float64
 35  min_health_index               240 non-null    float64
 36  heatdegdays                    10021 non-null  float64
 37  cooldegdays                    10021 non-null  float64
 38  growdegdays_5                  10021 non-null  float64
 39  growdegdays_7                  10021 non-null  float64
 40  growdegdays_10                 10021 non-null  float64
 41  precipitation                  9995 non-null   float64
 42  rain                           9984 non-null   float64
 43  snow                           9994 non-null   float64
 44  snow_on_ground                 7020 non-null   float64
 45  sunrise_hhmm                   3810 non-null   object 
 46  sunrise_unixtime               3810 non-null   float64
 47  sunrise_f                      3810 non-null   float64
 48  sunset_hhmm                    3810 non-null   object 
 49  sunset_unixtime                3810 non-null   float64
 50  sunset_f                       3810 non-null   float64
 51  daylight                       3810 non-null   float64
 52  min_uv_forecast                3148 non-null   float64
 53  max_uv_forecast                3148 non-null   float64
 54  min_high_temperature_forecast  3810 non-null   float64
 55  max_high_temperature_forecast  3810 non-null   float64
 56  min_low_temperature_forecast   3810 non-null   float64
 57  max_low_temperature_forecast   3810 non-null   float64
 58  solar_radiation                0 non-null      float64
 59  max_cloud_cover_4              0 non-null      float64
 60  avg_hourly_cloud_cover_4       0 non-null      float64
 61  avg_cloud_cover_4              0 non-null      float64
 62  min_cloud_cover_4              0 non-null      float64
 63  max_cloud_cover_8              3894 non-null   float64
 64  avg_hourly_cloud_cover_8       3894 non-null   float64
 65  avg_cloud_cover_8              3894 non-null   float64
 66  min_cloud_cover_8              3894 non-null   float64
 67  max_cloud_cover_10             237 non-null    float64
 68  avg_hourly_cloud_cover_10      237 non-null    float64
 69  avg_cloud_cover_10             237 non-null    float64
 70  min_cloud_cover_10             237 non-null    float64
dtypes: float64(62), int64(7), object(2)
memory usage: 5.5+ MB
Out[2]:
max_temperature avg_hourly_temperature avg_temperature min_temperature max_humidex min_windchill max_relative_humidity avg_hourly_relative_humidity avg_relative_humidity min_relative_humidity ... avg_cloud_cover_4 min_cloud_cover_4 max_cloud_cover_8 avg_hourly_cloud_cover_8 avg_cloud_cover_8 min_cloud_cover_8 max_cloud_cover_10 avg_hourly_cloud_cover_10 avg_cloud_cover_10 min_cloud_cover_10
date
1996-12-30 -2.0 -5.63 -4.84 -7.7 NaN -15.0 86 75.9 71.0 56 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1996-12-31 -7.3 -11.12 -11.00 -14.7 NaN -23.0 90 77.2 77.5 65 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-01 -3.0 -7.96 -8.80 -14.6 NaN -22.0 98 91.4 91.0 84 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-02 4.1 0.95 0.44 -3.2 NaN -4.0 100 97.8 96.5 93 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-03 5.6 2.95 3.40 1.2 NaN NaN 100 90.8 91.0 82 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 71 columns

In [3]:
# Removing the first two rows of year 1996
df = df.drop(df.index[:2])  # Removes the first two rows
df.shape
Out[3]:
(10019, 71)

Preprocessing¶

  1. Handling missing values
  2. Cleaning the data
  3. Splitting the data into training and testing sets
In [4]:
# Finding missing rows
df[df.isnull().any(axis=1)]
Out[4]:
max_temperature avg_hourly_temperature avg_temperature min_temperature max_humidex min_windchill max_relative_humidity avg_hourly_relative_humidity avg_relative_humidity min_relative_humidity ... avg_cloud_cover_4 min_cloud_cover_4 max_cloud_cover_8 avg_hourly_cloud_cover_8 avg_cloud_cover_8 min_cloud_cover_8 max_cloud_cover_10 avg_hourly_cloud_cover_10 avg_cloud_cover_10 min_cloud_cover_10
date
1997-01-01 -3.0 -7.96 -8.80 -14.6 NaN -22.0 98 91.4 91.0 84 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-02 4.1 0.95 0.44 -3.2 NaN -4.0 100 97.8 96.5 93 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-03 5.6 2.95 3.40 1.2 NaN NaN 100 90.8 91.0 82 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-04 3.4 2.35 2.20 1.0 NaN NaN 100 91.6 91.0 82 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1997-01-05 10.1 3.48 4.20 -1.7 NaN -9.0 100 86.5 83.0 66 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-06-02 18.2 16.21 16.30 14.4 NaN NaN 100 82.8 74.5 49 ... NaN NaN 8.0 7.8 6.5 5.0 NaN NaN NaN NaN
2024-06-03 23.4 18.42 19.54 15.7 29.0 NaN 100 90.8 85.5 71 ... NaN NaN 8.0 4.5 4.0 0.0 NaN NaN NaN NaN
2024-06-04 26.8 21.17 21.05 15.3 33.0 NaN 99 75.8 79.5 60 ... NaN NaN 8.0 4.8 4.0 0.0 NaN NaN NaN NaN
2024-06-05 25.6 21.68 21.70 17.8 33.0 NaN 100 77.3 84.0 68 ... NaN NaN 8.0 5.8 4.0 0.0 NaN NaN NaN NaN
2024-06-06 26.7 21.31 21.10 15.5 28.0 NaN 100 68.8 68.5 37 ... NaN NaN 8.0 5.4 4.5 1.0 NaN NaN NaN NaN

10019 rows × 71 columns

In [5]:
# Checking for the number of missing values before data cleaning
df.isna().sum()
Out[5]:
max_temperature                 0
avg_hourly_temperature          0
avg_temperature                 0
min_temperature                 0
max_humidex                  7196
                             ... 
min_cloud_cover_8            6125
max_cloud_cover_10           9782
avg_hourly_cloud_cover_10    9782
avg_cloud_cover_10           9782
min_cloud_cover_10           9782
Length: 71, dtype: int64
In [6]:
# Calculating the percentage of null values  
null_pct =  round(df.isna().sum()*100 / len(df), 2)
null_pct
Out[6]:
max_temperature               0.00
avg_hourly_temperature        0.00
avg_temperature               0.00
min_temperature               0.00
max_humidex                  71.82
                             ...  
min_cloud_cover_8            61.13
max_cloud_cover_10           97.63
avg_hourly_cloud_cover_10    97.63
avg_cloud_cover_10           97.63
min_cloud_cover_10           97.63
Length: 71, dtype: float64
In [7]:
# Getting columns with less than 40% missing values
valid_cols = df.columns[null_pct < .40]
valid_cols
Out[7]:
Index(['max_temperature', 'avg_hourly_temperature', 'avg_temperature',
       'min_temperature', 'max_relative_humidity',
       'avg_hourly_relative_humidity', 'avg_relative_humidity',
       'min_relative_humidity', 'max_dew_point', 'avg_hourly_dew_point',
       'avg_dew_point', 'min_dew_point', 'max_wind_speed',
       'avg_hourly_wind_speed', 'avg_wind_speed', 'min_wind_speed',
       'max_pressure_sea', 'avg_hourly_pressure_sea', 'avg_pressure_sea',
       'min_pressure_sea', 'max_pressure_station',
       'avg_hourly_pressure_station', 'avg_pressure_station',
       'min_pressure_station', 'max_visibility', 'avg_hourly_visibility',
       'avg_visibility', 'min_visibility', 'heatdegdays', 'cooldegdays',
       'growdegdays_5', 'growdegdays_7', 'growdegdays_10', 'precipitation',
       'rain', 'snow'],
      dtype='object')
In [8]:
# Assigning the valid columns only to df
df = df[valid_cols].copy()
In [9]:
# Checking for the number of missing values before cleaning
df.isna().sum()
Out[9]:
max_temperature                  0
avg_hourly_temperature           0
avg_temperature                  0
min_temperature                  0
max_relative_humidity            0
avg_hourly_relative_humidity     0
avg_relative_humidity            0
min_relative_humidity            0
max_dew_point                    0
avg_hourly_dew_point             0
avg_dew_point                    0
min_dew_point                    0
max_wind_speed                   0
avg_hourly_wind_speed            0
avg_wind_speed                   0
min_wind_speed                   0
max_pressure_sea                 0
avg_hourly_pressure_sea          0
avg_pressure_sea                 0
min_pressure_sea                 0
max_pressure_station             0
avg_hourly_pressure_station      0
avg_pressure_station             0
min_pressure_station             0
max_visibility                   0
avg_hourly_visibility            0
avg_visibility                   0
min_visibility                   0
heatdegdays                      0
cooldegdays                      0
growdegdays_5                    0
growdegdays_7                    0
growdegdays_10                   0
precipitation                   26
rain                            37
snow                            27
dtype: int64
In [10]:
# Missing Data heatmap before cleaning
sns.heatmap(df.isna(), yticklabels=False, cbar=False, cmap='rocket')
plt.show()
In [11]:
# Filling in the missing values for precipitation, rain and snow
df = df.ffill()
In [12]:
# Checking for the number of missing values post-cleaning
df.isna().sum()
Out[12]:
max_temperature                 0
avg_hourly_temperature          0
avg_temperature                 0
min_temperature                 0
max_relative_humidity           0
avg_hourly_relative_humidity    0
avg_relative_humidity           0
min_relative_humidity           0
max_dew_point                   0
avg_hourly_dew_point            0
avg_dew_point                   0
min_dew_point                   0
max_wind_speed                  0
avg_hourly_wind_speed           0
avg_wind_speed                  0
min_wind_speed                  0
max_pressure_sea                0
avg_hourly_pressure_sea         0
avg_pressure_sea                0
min_pressure_sea                0
max_pressure_station            0
avg_hourly_pressure_station     0
avg_pressure_station            0
min_pressure_station            0
max_visibility                  0
avg_hourly_visibility           0
avg_visibility                  0
min_visibility                  0
heatdegdays                     0
cooldegdays                     0
growdegdays_5                   0
growdegdays_7                   0
growdegdays_10                  0
precipitation                   0
rain                            0
snow                            0
dtype: int64
In [13]:
# Getting a Statistical Summary of the data
df.describe(include="all")
Out[13]:
max_temperature avg_hourly_temperature avg_temperature min_temperature max_relative_humidity avg_hourly_relative_humidity avg_relative_humidity min_relative_humidity max_dew_point avg_hourly_dew_point ... avg_visibility min_visibility heatdegdays cooldegdays growdegdays_5 growdegdays_7 growdegdays_10 precipitation rain snow
count 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 ... 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000 10019.000000
mean 13.630752 9.143915 9.010631 4.392035 85.401637 69.332628 69.030342 52.659048 6.523206 3.346003 ... 19789.799381 14947.509732 10.027927 1.038677 6.749805 5.573540 4.027478 2.160196 1.859158 0.320830
std 11.315122 10.480384 10.545357 10.042189 10.221874 12.286544 11.172906 14.556967 9.682561 9.960977 ... 5173.366838 8906.128122 9.247519 2.201631 7.208566 6.469344 5.293412 5.388539 5.191014 1.532879
min -19.100000 -21.950000 -22.300000 -26.300000 38.000000 27.700000 29.500000 13.000000 -24.800000 -28.100000 ... 2000.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 4.000000 0.930000 0.840000 -2.600000 79.000000 61.300000 61.500000 42.000000 -0.900000 -4.200000 ... 15250.000000 6400.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 14.000000 9.150000 9.050000 4.300000 87.000000 69.700000 69.500000 52.000000 6.800000 3.300000 ... 20700.000000 16100.000000 8.900000 0.000000 4.000000 2.000000 0.000000 0.000000 0.000000 0.000000
75% 23.600000 18.520000 18.390000 13.100000 93.000000 77.700000 77.000000 62.500000 14.800000 11.800000 ... 24100.000000 24100.000000 17.200000 0.400000 13.400000 11.400000 8.400000 1.400000 0.600000 0.000000
max 37.900000 31.700000 31.950000 26.300000 100.000000 100.000000 100.000000 100.000000 26.200000 24.400000 ... 52300.000000 25000.000000 40.300000 13.900000 26.900000 24.900000 21.900000 126.000000 126.000000 30.400000

8 rows × 36 columns

In [14]:
# Checking the data types of the attributes
df.dtypes
Out[14]:
max_temperature                 float64
avg_hourly_temperature          float64
avg_temperature                 float64
min_temperature                 float64
max_relative_humidity             int64
avg_hourly_relative_humidity    float64
avg_relative_humidity           float64
min_relative_humidity             int64
max_dew_point                   float64
avg_hourly_dew_point            float64
avg_dew_point                   float64
min_dew_point                   float64
max_wind_speed                    int64
avg_hourly_wind_speed           float64
avg_wind_speed                  float64
min_wind_speed                    int64
max_pressure_sea                float64
avg_hourly_pressure_sea         float64
avg_pressure_sea                float64
min_pressure_sea                float64
max_pressure_station            float64
avg_hourly_pressure_station     float64
avg_pressure_station            float64
min_pressure_station            float64
max_visibility                    int64
avg_hourly_visibility           float64
avg_visibility                    int64
min_visibility                    int64
heatdegdays                     float64
cooldegdays                     float64
growdegdays_5                   float64
growdegdays_7                   float64
growdegdays_10                  float64
precipitation                   float64
rain                            float64
snow                            float64
dtype: object
In [15]:
# Checking the datatype of the index (i.e. the dates)
df.index
Out[15]:
Index(['1997-01-01', '1997-01-02', '1997-01-03', '1997-01-04', '1997-01-05',
       '1997-01-06', '1997-01-07', '1997-01-08', '1997-01-09', '1997-01-10',
       ...
       '2024-05-28', '2024-05-29', '2024-05-30', '2024-05-31', '2024-06-01',
       '2024-06-02', '2024-06-03', '2024-06-04', '2024-06-05', '2024-06-06'],
      dtype='object', name='date', length=10019)
In [16]:
# Converting the index datatype from object to date
df.index = pd.to_datetime(df.index)
In [17]:
df.index
Out[17]:
DatetimeIndex(['1997-01-01', '1997-01-02', '1997-01-03', '1997-01-04',
               '1997-01-05', '1997-01-06', '1997-01-07', '1997-01-08',
               '1997-01-09', '1997-01-10',
               ...
               '2024-05-28', '2024-05-29', '2024-05-30', '2024-05-31',
               '2024-06-01', '2024-06-02', '2024-06-03', '2024-06-04',
               '2024-06-05', '2024-06-06'],
              dtype='datetime64[ns]', name='date', length=10019, freq=None)
In [18]:
# Checking the data according to the years
df.index.year.value_counts().sort_index()
Out[18]:
date
1997    365
1998    365
1999    365
2000    366
2001    365
2002    365
2003    365
2004    366
2005    365
2006    365
2007    365
2008    366
2009    365
2010    365
2011    365
2012    366
2013    365
2014    365
2015    365
2016    366
2017    365
2018    365
2019    365
2020    366
2021    365
2022    365
2023    365
2024    158
Name: count, dtype: int64
In [19]:
# Correlation heatmap
plt.figure(figsize=(30, 30))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()
In [20]:
# Histogram for temperature
plt.figure(figsize=(10, 6))
sns.histplot(df['avg_temperature'], bins=30, kde=True)
plt.title('Distribution of Average Temperature')
plt.xlabel('Temperature (°C)')
plt.ylabel('Frequency')
plt.show()
In [21]:
# Histogram for rain
plt.figure(figsize=(10, 6))
sns.histplot(df['rain'], bins=30, kde=True)
plt.title('Distribution of Rainfall')
plt.xlabel('Rain')
plt.ylabel('Frequency')
plt.show()
In [22]:
# Histogram for precipitation
plt.figure(figsize=(10, 6))
sns.histplot(df['precipitation'], bins=30, kde=True)
plt.title('Distribution of Precipitation')
plt.xlabel('Precipitation')
plt.ylabel('Frequency')
plt.show()
In [23]:
# Histogram for relative humidity
plt.figure(figsize=(10, 6))
sns.histplot(df['avg_relative_humidity'], bins=30, kde=True)
plt.title('Distribution of Relative Humudity')
plt.xlabel('Relative Humidity')
plt.ylabel('Frequency')
plt.show()
In [24]:
# Box plot for temperature by month
df['month'] = df.index.month
plt.figure(figsize=(14, 7))
sns.boxplot(x='month', y='avg_temperature', data=df)
plt.title('Monthly Average Temperature Distribution')
plt.xlabel('Month')
plt.ylabel('Temperature (°C)')
plt.show()
In [25]:
# Scatter plot for temperature vs. humidity
plt.figure(figsize=(10, 6))
sns.scatterplot(x='avg_temperature', y='avg_relative_humidity', data=df)
plt.title('Temperature vs. Humidity')
plt.xlabel('Average Temperature (°C)')
plt.ylabel('Average Relative Humidity (%)')
plt.show()
In [26]:
df['max_temperature'].plot()
Out[26]:
<Axes: xlabel='date'>
In [27]:
df['snow'].plot()
Out[27]:
<Axes: xlabel='date'>
In [28]:
df['precipitation'].plot()
Out[28]:
<Axes: xlabel='date'>
In [29]:
df['rain'].plot()
Out[29]:
<Axes: xlabel='date'>
In [30]:
df['cooldegdays'].plot()
Out[30]:
<Axes: xlabel='date'>
In [31]:
df['max_wind_speed'].plot()
Out[31]:
<Axes: xlabel='date'>
In [32]:
# Missing Data heatmap post-cleaning
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='rocket')
plt.show()
In [33]:
# Correlation plot 
plt.figure(figsize=(30,30))
sns.heatmap(df.select_dtypes(include=['number']).corr(),annot=True,cmap='RdBu')

plt.title("Correlation",fontsize=40)
plt.show()
In [34]:
# Scatter Plot for max_temperature vs rain
sns.scatterplot(x='rain', y='max_temperature', data=df)
plt.title('max_temperature vs. rain')
plt.show()

# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['rain'], df['max_temperature'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: 0.10130268238898475
In [35]:
# Scatter Plot for max_tempeature vs avg_relative_humidity
sns.scatterplot(x='avg_relative_humidity', y='max_temperature', data=df)
plt.title('max_tempeature vs. avg_relative_humidity')
plt.show()

# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['max_temperature'], df['avg_relative_humidity'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: -0.18860673181518933
In [36]:
# Scatter Plot for max_temperature vs avg_dew_point
sns.scatterplot(x='max_temperature', y='avg_dew_point', data=df)
plt.title('max_temperature vs. avg_dew_point')
plt.show()

# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['max_temperature'], df['avg_dew_point'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: 0.9333212084360358
In [37]:
# Scatter Plot for rain vs avg_pressure_sea
sns.scatterplot(x='avg_pressure_sea', y='rain', data=df)
plt.title('rain vs. avg_pressure_sea')
plt.show()

# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['avg_pressure_sea'], df['rain'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: -0.27634540616698666
In [38]:
# Scatter Plot for max_temperature vs cooldegdays
sns.scatterplot(x='max_temperature', y='cooldegdays', data=df)
plt.title('max_temperature vs. cooldegdays')
plt.show()

# Pearson Correlation Coefficient
correlation, _ = pearsonr(df['max_temperature'], df['cooldegdays'])
print(f'Pearson correlation: {correlation}')
Pearson correlation: 0.6514285642421251

Modeling¶

In [39]:
# Setting the next day's max_temperature value as the target of the current day's features
df['y'] = df.shift(-1)['max_temperature']
In [40]:
df
Out[40]:
max_temperature avg_hourly_temperature avg_temperature min_temperature max_relative_humidity avg_hourly_relative_humidity avg_relative_humidity min_relative_humidity max_dew_point avg_hourly_dew_point ... heatdegdays cooldegdays growdegdays_5 growdegdays_7 growdegdays_10 precipitation rain snow month y
date
1997-01-01 -3.0 -7.96 -8.80 -14.6 98 91.4 91.0 84 -3.6 -9.1 ... 26.8 0.0 0.0 0.0 0.0 4.0 0.0 4.0 1 4.1
1997-01-02 4.1 0.95 0.44 -3.2 100 97.8 96.5 93 3.1 0.6 ... 17.6 0.0 0.0 0.0 0.0 1.4 1.4 0.0 1 5.6
1997-01-03 5.6 2.95 3.40 1.2 100 90.8 91.0 82 5.2 1.6 ... 14.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 3.4
1997-01-04 3.4 2.35 2.20 1.0 100 91.6 91.0 82 3.3 1.1 ... 15.8 0.0 0.0 0.0 0.0 4.2 4.2 0.0 1 10.1
1997-01-05 10.1 3.48 4.20 -1.7 100 86.5 83.0 66 8.3 1.3 ... 13.8 0.0 0.0 0.0 0.0 3.0 3.0 0.0 1 -1.6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-06-02 18.2 16.21 16.30 14.4 100 82.8 74.5 49 17.0 12.9 ... 1.7 0.0 11.3 9.3 6.3 7.0 7.0 0.0 6 23.4
2024-06-03 23.4 18.42 19.54 15.7 100 90.8 85.5 71 20.3 16.8 ... 0.0 1.5 14.5 12.5 9.5 0.0 0.0 0.0 6 26.8
2024-06-04 26.8 21.17 21.05 15.3 99 75.8 79.5 60 19.1 16.5 ... 0.0 3.0 16.1 14.1 11.1 0.0 0.0 0.0 6 25.6
2024-06-05 25.6 21.68 21.70 17.8 100 77.3 84.0 68 20.8 17.5 ... 0.0 3.7 16.7 14.7 11.7 3.2 3.2 0.0 6 26.7
2024-06-06 26.7 21.31 21.10 15.5 100 68.8 68.5 37 19.9 14.4 ... 0.0 3.1 16.1 14.1 11.1 0.8 0.8 0.0 6 NaN

10019 rows × 38 columns

In [41]:
# Handling the last row missing the target data
df = df.ffill()
In [42]:
df
Out[42]:
max_temperature avg_hourly_temperature avg_temperature min_temperature max_relative_humidity avg_hourly_relative_humidity avg_relative_humidity min_relative_humidity max_dew_point avg_hourly_dew_point ... heatdegdays cooldegdays growdegdays_5 growdegdays_7 growdegdays_10 precipitation rain snow month y
date
1997-01-01 -3.0 -7.96 -8.80 -14.6 98 91.4 91.0 84 -3.6 -9.1 ... 26.8 0.0 0.0 0.0 0.0 4.0 0.0 4.0 1 4.1
1997-01-02 4.1 0.95 0.44 -3.2 100 97.8 96.5 93 3.1 0.6 ... 17.6 0.0 0.0 0.0 0.0 1.4 1.4 0.0 1 5.6
1997-01-03 5.6 2.95 3.40 1.2 100 90.8 91.0 82 5.2 1.6 ... 14.6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 3.4
1997-01-04 3.4 2.35 2.20 1.0 100 91.6 91.0 82 3.3 1.1 ... 15.8 0.0 0.0 0.0 0.0 4.2 4.2 0.0 1 10.1
1997-01-05 10.1 3.48 4.20 -1.7 100 86.5 83.0 66 8.3 1.3 ... 13.8 0.0 0.0 0.0 0.0 3.0 3.0 0.0 1 -1.6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2024-06-02 18.2 16.21 16.30 14.4 100 82.8 74.5 49 17.0 12.9 ... 1.7 0.0 11.3 9.3 6.3 7.0 7.0 0.0 6 23.4
2024-06-03 23.4 18.42 19.54 15.7 100 90.8 85.5 71 20.3 16.8 ... 0.0 1.5 14.5 12.5 9.5 0.0 0.0 0.0 6 26.8
2024-06-04 26.8 21.17 21.05 15.3 99 75.8 79.5 60 19.1 16.5 ... 0.0 3.0 16.1 14.1 11.1 0.0 0.0 0.0 6 25.6
2024-06-05 25.6 21.68 21.70 17.8 100 77.3 84.0 68 20.8 17.5 ... 0.0 3.7 16.7 14.7 11.7 3.2 3.2 0.0 6 26.7
2024-06-06 26.7 21.31 21.10 15.5 100 68.8 68.5 37 19.9 14.4 ... 0.0 3.1 16.1 14.1 11.1 0.8 0.8 0.0 6 26.7

10019 rows × 38 columns

Using the Ridge Regression Model¶

In [43]:
# Checking the correlation between the target and the features
df.corr()
Out[43]:
max_temperature avg_hourly_temperature avg_temperature min_temperature max_relative_humidity avg_hourly_relative_humidity avg_relative_humidity min_relative_humidity max_dew_point avg_hourly_dew_point ... heatdegdays cooldegdays growdegdays_5 growdegdays_7 growdegdays_10 precipitation rain snow month y
max_temperature 1.000000 0.989004 0.989004 0.950527 0.027333 -0.192890 -0.188607 -0.308716 0.933503 0.932883 ... -0.972667 0.651429 0.924878 0.901518 0.859081 0.027299 0.101303 -0.268868 0.270073 0.927880
avg_hourly_temperature 0.989004 1.000000 0.997628 0.981032 0.063368 -0.136254 -0.128796 -0.242207 0.949732 0.959909 ... -0.982135 0.652993 0.926195 0.903849 0.862624 0.051095 0.124014 -0.262688 0.296015 0.930555
avg_temperature 0.989004 0.997628 1.000000 0.986016 0.071547 -0.125816 -0.120829 -0.235720 0.950102 0.960796 ... -0.983774 0.657466 0.927268 0.904992 0.864283 0.057854 0.131066 -0.263320 0.295486 0.930185
min_temperature 0.950527 0.981032 0.986016 1.000000 0.119496 -0.046906 -0.041258 -0.147244 0.943749 0.966910 ... -0.970362 0.646838 0.905461 0.884988 0.847292 0.090772 0.161167 -0.250135 0.316347 0.908248
max_relative_humidity 0.027333 0.063368 0.071547 0.119496 1.000000 0.837028 0.857901 0.614731 0.307957 0.295843 ... -0.085951 -0.018506 0.012932 0.009350 0.003303 0.362622 0.338861 0.128348 0.184884 -0.024880
avg_hourly_relative_humidity -0.192890 -0.136254 -0.125816 -0.046906 0.837028 1.000000 0.971945 0.904234 0.121739 0.143392 ... 0.115042 -0.119546 -0.163160 -0.157998 -0.150033 0.407994 0.371475 0.178007 0.151327 -0.214626
avg_relative_humidity -0.188607 -0.128796 -0.120829 -0.041258 0.857901 0.971945 1.000000 0.932643 0.127504 0.143053 ... 0.109676 -0.118115 -0.158784 -0.153300 -0.145361 0.381645 0.345622 0.172243 0.142302 -0.210255
min_relative_humidity -0.308716 -0.242207 -0.235720 -0.147244 0.614731 0.904234 0.932643 1.000000 -0.020520 0.011855 ... 0.228715 -0.168318 -0.252824 -0.241889 -0.225457 0.331215 0.292603 0.174277 0.088616 -0.305284
max_dew_point 0.933503 0.949732 0.950102 0.943749 0.307957 0.121739 0.127504 -0.020520 1.000000 0.983736 ... -0.938001 0.610683 0.874464 0.852964 0.812578 0.168162 0.231609 -0.214667 0.323504 0.853536
avg_hourly_dew_point 0.932883 0.959909 0.960796 0.966910 0.295843 0.143392 0.143053 0.011855 0.983736 1.000000 ... -0.947515 0.621960 0.882097 0.861660 0.823253 0.157889 0.222395 -0.219858 0.340616 0.869052
avg_dew_point 0.933321 0.959152 0.961425 0.967738 0.292715 0.134383 0.138772 0.007479 0.985909 0.997046 ... -0.948272 0.621816 0.882576 0.862011 0.823427 0.152258 0.217584 -0.223922 0.342908 0.871293
min_dew_point 0.909369 0.943523 0.947528 0.965537 0.271016 0.142836 0.145795 0.033497 0.947615 0.984101 ... -0.933718 0.616420 0.867674 0.848510 0.812607 0.133528 0.198947 -0.226887 0.352276 0.865701
max_wind_speed -0.170897 -0.189302 -0.187364 -0.200973 -0.007199 -0.036928 -0.027482 -0.037131 -0.138182 -0.195519 ... 0.196811 -0.070648 -0.173766 -0.168656 -0.156746 0.109196 0.089280 0.079964 -0.107901 -0.296579
avg_hourly_wind_speed -0.271283 -0.275519 -0.274141 -0.270128 -0.071220 -0.075754 -0.055942 -0.035865 -0.243463 -0.291787 ... 0.281126 -0.132145 -0.251353 -0.243120 -0.226597 0.042786 0.013014 0.106016 -0.113260 -0.362065
avg_wind_speed -0.234867 -0.242977 -0.240522 -0.240544 -0.039014 -0.044383 -0.032139 -0.021939 -0.202055 -0.250661 ... 0.249283 -0.104861 -0.221058 -0.213714 -0.198479 0.079611 0.054756 0.094843 -0.104460 -0.341463
min_wind_speed -0.286538 -0.270834 -0.268147 -0.240347 -0.090977 -0.044662 -0.030822 0.016571 -0.264605 -0.278925 ... 0.272548 -0.139504 -0.243246 -0.233808 -0.216911 -0.007837 -0.034412 0.093283 -0.062148 -0.318070
max_pressure_sea -0.352729 -0.382000 -0.387217 -0.415880 -0.291259 -0.239766 -0.246548 -0.173945 -0.436057 -0.446047 ... 0.383054 -0.245844 -0.324440 -0.315708 -0.302972 -0.205508 -0.221732 0.032997 -0.039681 -0.241204
avg_hourly_pressure_sea -0.224253 -0.248058 -0.251930 -0.276491 -0.352745 -0.314724 -0.321792 -0.246272 -0.347237 -0.332467 ... 0.251730 -0.149402 -0.185669 -0.179202 -0.171062 -0.282842 -0.277438 -0.049833 0.001910 -0.088188
avg_pressure_sea -0.216687 -0.240394 -0.245286 -0.271059 -0.348941 -0.309051 -0.319080 -0.244782 -0.340101 -0.323652 ... 0.244250 -0.149032 -0.181043 -0.175183 -0.168063 -0.281398 -0.276345 -0.048853 0.000942 -0.087902
min_pressure_sea -0.071091 -0.087258 -0.091344 -0.111775 -0.363896 -0.338677 -0.350543 -0.282577 -0.217536 -0.178997 ... 0.093224 -0.046017 -0.032589 -0.029949 -0.028656 -0.319910 -0.296216 -0.117482 0.037430 0.059657
max_pressure_station -0.272381 -0.300447 -0.305794 -0.335392 -0.293845 -0.258518 -0.265030 -0.200500 -0.361824 -0.369887 ... 0.302440 -0.194458 -0.249536 -0.242748 -0.233554 -0.205386 -0.215306 0.009625 -0.011517 -0.161050
avg_hourly_pressure_station -0.140921 -0.164038 -0.168181 -0.194479 -0.353146 -0.331888 -0.338497 -0.271634 -0.269363 -0.253383 ... 0.168965 -0.095910 -0.108151 -0.103667 -0.099183 -0.283084 -0.271096 -0.074043 0.028890 -0.007683
avg_pressure_station -0.130336 -0.153374 -0.158430 -0.185926 -0.349135 -0.326247 -0.335916 -0.270489 -0.259186 -0.241601 ... 0.158468 -0.093336 -0.100598 -0.096787 -0.093449 -0.280606 -0.268740 -0.073835 0.029056 -0.004565
min_pressure_station 0.008058 -0.008145 -0.012425 -0.035199 -0.361371 -0.351776 -0.363189 -0.303763 -0.141988 -0.103687 ... 0.015444 0.005274 0.040781 0.041591 0.039458 -0.317510 -0.287747 -0.139426 0.061603 0.133479
max_visibility 0.030300 0.013857 0.011294 -0.010416 -0.117846 -0.202143 -0.201769 -0.226976 -0.031649 -0.039616 ... -0.018676 -0.024187 0.010421 0.005593 -0.001860 -0.099173 -0.084126 -0.064567 -0.001997 0.030510
avg_hourly_visibility 0.173586 0.139956 0.133159 0.084092 -0.470062 -0.628284 -0.606394 -0.600774 -0.016716 -0.022359 ... -0.141414 0.044135 0.136559 0.127395 0.112277 -0.412921 -0.327512 -0.346730 0.053943 0.202211
avg_visibility 0.217006 0.186833 0.179377 0.132254 -0.544138 -0.635329 -0.631286 -0.586968 0.003309 0.017349 ... -0.191061 0.056899 0.161484 0.149919 0.132229 -0.406746 -0.340652 -0.286284 0.066972 0.260355
min_visibility 0.238493 0.210828 0.203317 0.158327 -0.579204 -0.647268 -0.642739 -0.579926 0.018065 0.037956 ... -0.213574 0.076970 0.182922 0.171656 0.154454 -0.427977 -0.357954 -0.303580 0.078702 0.288759
heatdegdays -0.972667 -0.982135 -0.983774 -0.970362 -0.085951 0.115042 0.109676 0.228715 -0.938001 -0.947515 ... 1.000000 -0.511641 -0.871134 -0.838756 -0.780267 -0.061782 -0.138612 0.276821 -0.311740 -0.917773
cooldegdays 0.651429 0.652993 0.657466 0.646838 -0.018506 -0.119546 -0.118115 -0.168318 0.610683 0.621960 ... -0.511641 1.000000 0.782245 0.811555 0.862263 0.017254 0.045273 -0.098752 0.106086 0.600270
growdegdays_5 0.924878 0.926195 0.927268 0.905461 0.012932 -0.163160 -0.158784 -0.252824 0.874464 0.882097 ... -0.871134 0.782245 1.000000 0.995900 0.974675 0.030329 0.085744 -0.195889 0.206545 0.876579
growdegdays_7 0.901518 0.903849 0.904992 0.884988 0.009350 -0.157998 -0.153300 -0.241889 0.852964 0.861660 ... -0.838756 0.811555 0.995900 1.000000 0.989167 0.026295 0.077245 -0.180298 0.192357 0.854507
growdegdays_10 0.859081 0.862624 0.864283 0.847292 0.003303 -0.150033 -0.145361 -0.225457 0.812578 0.823253 ... -0.780267 0.862263 0.974675 0.989167 1.000000 0.021470 0.066414 -0.159261 0.171671 0.813289
precipitation 0.027299 0.051095 0.057854 0.090772 0.362622 0.407994 0.381645 0.331215 0.168162 0.157889 ... -0.061782 0.017254 0.030329 0.026295 0.021470 1.000000 0.959758 0.241716 0.003906 -0.007350
rain 0.101303 0.124014 0.131066 0.161167 0.338861 0.371475 0.345622 0.292603 0.231609 0.222395 ... -0.138612 0.045273 0.085744 0.077245 0.066414 0.959758 1.000000 -0.021995 0.034456 0.067095
snow -0.268868 -0.262688 -0.263320 -0.250135 0.128348 0.178007 0.172243 0.174277 -0.214667 -0.219858 ... 0.276821 -0.098752 -0.195889 -0.180298 -0.159261 0.241716 -0.021995 1.000000 -0.104762 -0.273316
month 0.270073 0.296015 0.295486 0.316347 0.184884 0.151327 0.142302 0.088616 0.323504 0.340616 ... -0.311740 0.106086 0.206545 0.192357 0.171671 0.003906 0.034456 -0.104762 1.000000 0.258859
y 0.927880 0.930555 0.930185 0.908248 -0.024880 -0.214626 -0.210255 -0.305284 0.853536 0.869052 ... -0.917773 0.600270 0.876579 0.854507 0.813289 -0.007350 0.067095 -0.273316 0.258859 1.000000

38 rows × 38 columns

In [44]:
# Initializing the Ridge Regression Model
ridge_reg = Ridge(alpha = .1)
In [45]:
# Creating a list of predictor columns
X = df.columns[~df.columns.isin(['y'])]
In [46]:
# Time-series Cross Validation
def backtest(df, model, X, start=3650, step=90):
    all_predictions = []
    
    for i in range(start, df.shape[0], step):
        train = df.iloc[:i,:]
        test = df.iloc[i:(i+step),:]
        
        model.fit(train[X], train['y'])
        
        preds = model.predict(test[X])
        
        preds = pd.Series(preds, index = test.index) # index stays the same as our test data
        combined = pd.concat([test['y'], preds], axis=1)
        
        combined.columns = ['actual', 'prediction']
        
        combined['diff'] = (combined['prediction'] - combined['actual']).abs()
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)
In [47]:
predictions = backtest(df, ridge_reg, X)
In [48]:
predictions
Out[48]:
actual prediction diff
date
2006-12-30 5.2 4.512077 0.687923
2006-12-31 9.2 5.348854 3.851146
2007-01-01 5.8 6.055589 0.255589
2007-01-02 8.0 8.376891 0.376891
2007-01-03 11.9 7.741514 4.158486
... ... ... ...
2024-06-02 23.4 21.161758 2.238242
2024-06-03 26.8 24.138726 2.661274
2024-06-04 25.6 26.653006 1.053006
2024-06-05 26.7 25.595051 1.104949
2024-06-06 26.7 22.887354 3.812646

6369 rows × 3 columns

In [49]:
# Generating the MSE
mean_squared_error(predictions['actual'], predictions['prediction'])
Out[49]:
12.789728514684702
In [50]:
# Generating the MAE
predictions['diff'].mean()

# mean_absolute_error(predictions['actual'], predictions['prediction'])
Out[50]:
2.7690746620485
In [51]:
# Generating R2
r2_score(predictions['actual'], predictions['prediction'])
Out[51]:
0.9010629886108817
In [52]:
# Checking the errors
predictions['diff'].round().value_counts().sort_index()
Out[52]:
diff
0.0      827
1.0     1444
2.0     1242
3.0      937
4.0      659
5.0      471
6.0      305
7.0      211
8.0      124
9.0       71
10.0      34
11.0      24
12.0      11
13.0       3
14.0       4
16.0       2
Name: count, dtype: int64
In [ ]: